blah blah
This file belongs to the repository: https://github.com/drisso/awst_analysis.
The code is released with license GPL v3.0.
awstif (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("drisso/awst")
#BiocManager::install("GIS-SP-Group/RCA")
#BiocManager::install("hgu133plus2.db")
rm(list = ls())
#library(steFunctions)
library(dendextend)
library(clues)
#setwd("~/Dropbox/AWST/mixology/")
jobName <- "mixoloy20200818"
source(url("https://raw.githubusercontent.com/drisso/awst_analysis/master/functions.R"))
#####
save_plots <- FALSE
png_width_large <- 2100
png_height_large <- 500
png_width_small <- width_png <- 700
png_height_small <- 700
png_res <- 1/300
####
true_number_of_clusters <- c(3, 5, 7, 10, 10)
names(true_number_of_clusters) <- c("sc_10x", "sc_10x_5cl", "RNAmix_celseq2", "cellmix4", "cellmix3")
####
results <- matrix(NA, ncol = 8, nrow = 10)
colnames(results) <- c("where", "what", "ARI", "accuracy", "purity", "avg", "noOfClust_Est", "noOfClust_Th")
#results <- data.frame(where = rep("", 30), what = "", ARI = NA, accuracy = NA, purity = NA, noOfClust = NA)
k <- 0
con <- gzcon(url("https://github.com/LuyiTian/sc_mixology/blob/master/data/csv/sc_10x_5cl.metadata.csv.gz?raw=true"))
annotation.df <- read.csv(textConnection(readLines(con)))
annotation.df$cell.col <- factor(annotation.df$cell_line)
levels(annotation.df$cell.col) <- c("gold", "red", "blue", "magenta", "green3")
annotation.df$cell.col <- as.character(annotation.df$cell.col)
annotation.df$cell_line <- paste0("mix", annotation.df$mix)
con <- gzcon(url("https://github.com/LuyiTian/sc_mixology/blob/master/data/csv/sc_10x_5cl.count.csv.gz?raw=true"))
ddata <- read.csv(textConnection(readLines(con)))
both <- intersect(colnames(ddata), rownames(annotation.df))
ddata <- ddata[, both]
annotation.df <- annotation.df[both,]
prefix <- "sc_10x_5cl"
save(ddata, annotation.df, prefix, file = paste0(prefix, "_counts.RData"))
require(awst)
require(EDASeq)
require(Rtsne)
require(umap)
require(SingleCellExperiment)
require(clusterExperiment)
############################
#load(paste0(prefix, "_counts.RData"))
#load(paste0(prefix, "_expression.RData"))
#######################
no_of_detected_gene_per_sample <- colSums(ddata > 0)
fivenum(no_of_detected_gene_per_sample)
ddata <- EDASeq::betweenLaneNormalization(as.matrix(ddata), which = "full", round = FALSE)
# apply the AWS-transformation
tmp <- rowSums(ddata)
sum(tmp > 0)
tmp <- colSums(ddata)
sum(tmp > 0)
exprData <- awst(ddata, poscount = TRUE, full_quantile = TRUE)
sum(is.na(rowSums(exprData)))
sum(is.na(colSums(exprData)))
save(exprData, prefix, file = paste0(prefix, "_expression.RData"))
#dim(exprData <- gene_filter(exprData))
#write.table(exprData, file = paste0(prefix, "_expression.tsv"), sep = "\t")
nrow_exprData <- nrow(exprData)
ncol_exprData <- ncol(exprData)
library(irlba)
pprcomp <- prcomp_irlba(exprData, 5) # Run PC analysis
set.seed(2020)
ans_Rtsne <- Rtsne(exprData, pca = FALSE) # Run TSNE
set.seed(2020)
ans_umap <- umap(exprData) # Run Umap
#ans_RaceID <- ans_RaceID_raw <- NULL
#if(prefix %in% c("RNAmix_celseq2", "cellmix3", "cellmix4")) {
require(SingleCellExperiment)
if(FALSE) {
tmp <- get_RCA()
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_RCA_raw <- colData(tmp)[, -wwhich]
tmp <- get_RCA(is_awst = TRUE)
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_RCA_awst <- colData(tmp)[, -wwhich]
tmp <- get_RaceID()
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_RaceID_raw <- colData(tmp)[, -wwhich]
tmp <- get_RaceID(is_awst = TRUE)
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_RaceID_awst <- colData(tmp)[, -wwhich]
tmp <- get_SC3()
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_SC3_raw <- colData(tmp)[, -wwhich]
tmp <- get_SC3(is_awst = TRUE)
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_SC3_awst <- colData(tmp)[, -wwhich]
tmp <- get_Seurat()
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_Seurat_raw_LoR <- colData(tmp)[, -wwhich]
tmp <- get_Seurat(is_awst = TRUE)
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_Seurat_awst_LoR <- colData(tmp)[, -wwhich]
tmp <- get_Seurat(resolution = 1.6)
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_Seurat_raw_HiR <- colData(tmp)[, -wwhich]
tmp <- get_Seurat(is_awst = TRUE, resolution = 1.6)
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_Seurat_awst_HiR <- colData(tmp)[, -wwhich]
}
tmp <- get_clusterExperiment()
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_clusterExp_raw <- colData(tmp)[, -wwhich, drop=FALSE]
tmp <- get_clusterExperiment(is_awst = TRUE)
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_clusterExp_awst <- colData(tmp)[, -wwhich, drop=FALSE]
save(nrow_exprData, ncol_exprData, annotation.df,
pprcomp, ans_Rtsne, ans_umap, prefix,
ans_clusterExp_awst, ans_clusterExp_raw,
file = paste0(prefix, "_expression_working.RData"))
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## estimated no. of clusters: 40
## adjusted Rand-index: 0.8362
## cluster purity: 0.2468
## cluster accuracy: 0.0503
## estimated no. of clusters: 28
## adjusted Rand-index: 0.8377
## cluster purity: 0.221
## cluster accuracy: 0.0238
## estimated no. of clusters: 25
## adjusted Rand-index: 0.725
## cluster purity: 0.3336
## cluster accuracy: 0.0136
## estimated no. of clusters: 22
## adjusted Rand-index: 0.7413
## cluster purity: 0.2962
## cluster accuracy: 0.0091
## estimated no. of clusters: 19
## adjusted Rand-index: 0.8868
## cluster purity: 0.3371
## cluster accuracy: 0.1493
## estimated no. of clusters: 12
## adjusted Rand-index: 0.9007
## cluster purity: 0.1937
## cluster accuracy: 0.1863
##
## 009 018 027 036 045 054 063 072 081 090 108 111 117 171 180 207 225 252 270 306
## 18 4 4 5 4 4 4 4 3 10 5 2 18 18 4 5 20 17 4 5
## 333 360 405 450 504 522 540 603 630 702 711 720 801 810 900
## 30 5 3 4 5 17 4 5 2 4 16 4 4 5 19
##
## balanced H1975 H1975High H1975Low H2228 H2228High H2228Low
## 32 19 33 33 10 34 33
## HCC827 HCC827High HCC827Low
## 18 37 36
## Loading required package: awst
## Loading required package: EDASeq
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: ShortRead
## Loading required package: BiocParallel
## Loading required package: Biostrings
## Loading required package: IRanges
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:dendextend':
##
## nnodes
## The following object is masked from 'package:base':
##
## strsplit
## Loading required package: Rsamtools
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GenomicAlignments
## Loading required package: SummarizedExperiment
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
## Loading required package: Rtsne
## Loading required package: umap
## Loading required package: SingleCellExperiment
## Loading required package: clusterExperiment
##
## Attaching package: 'clusterExperiment'
## The following object is masked from 'package:clues':
##
## plotClusters
## J2 G6 P19 M12 E17
## 1604 2660 3371 4205 8325
## [1] 14052
## [1] 285
## [1] 0
## [1] 0
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
## Note: Merging will be done on ' makeConsensus ', with clustering index 1
## Warning: Zero sample variances detected, have been offset away from zero
## Note: Merging will be done on ' makeConsensus ', with clustering index 1
## estimated no. of clusters: 11
## adjusted Rand-index: 0.8274
## cluster purity: 0.3762
## cluster accuracy: 0.4554
## estimated no. of clusters: 2
## adjusted Rand-index: 0.4204
## cluster purity: 0.1182
## cluster accuracy: 0.4228
##
## balanced H1975 H1975High H1975Low H2228 H2228High H2228Low
## 33 19 33 33 10 34 34
## HCC827 HCC827High HCC827Low
## 18 38 36
## H9 H16 C10 A19 I6
## 2382.0 4022.0 4914.5 5906.0 11816.0
## [1] 15146
## [1] 288
## [1] 0
## [1] 0
## Note: Merging will be done on ' makeConsensus ', with clustering index 1
## Warning: Zero sample variances detected, have been offset away from zero
## Note: Merging will be done on ' makeConsensus ', with clustering index 1
## estimated no. of clusters: 13
## adjusted Rand-index: 0.8412
## cluster purity: 0.4182
## cluster accuracy: 0.458
## estimated no. of clusters: 2
## adjusted Rand-index: 0.3134
## cluster purity: 0.0854
## cluster accuracy: 0.4259
| where | what | ARI | accuracy | purity | avg | noOfClust_Est | noOfClust_Th |
|---|---|---|---|---|---|---|---|
| sc_10x_5cl | clusterExperiment + counts | 0.8362 | 0.0503 | 0.2468 | 0.8426 | 40 | 5 |
| sc_10x_5cl | clusterExperiment + AWST | 0.8377 | 0.0238 | 0.2210 | 0.8604 | 28 | 5 |
| sc_10x | clusterExperiment + counts | 0.7250 | 0.0136 | 0.3336 | 0.7811 | 25 | 3 |
| sc_10x | clusterExperiment + AWST | 0.7413 | 0.0091 | 0.2962 | 0.8026 | 22 | 3 |
| RNAmix_celseq2 | clusterExperiment + counts | 0.8868 | 0.1493 | 0.3371 | 0.7937 | 19 | 7 |
| RNAmix_celseq2 | clusterExperiment + AWST | 0.9007 | 0.1863 | 0.1937 | 0.8392 | 12 | 7 |
| cellmix3 | clusterExperiment + counts | 0.8274 | 0.4554 | 0.3762 | 0.6551 | 11 | 10 |
| cellmix3 | clusterExperiment + AWST | 0.4204 | 0.4228 | 0.1182 | 0.5981 | 2 | 10 |
| cellmix4 | clusterExperiment + counts | 0.8412 | 0.4580 | 0.4182 | 0.6425 | 13 | 10 |
| cellmix4 | clusterExperiment + AWST | 0.3134 | 0.4259 | 0.0854 | 0.548 | 2 | 10 |
(*) with at least a cluster of size 1.
sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] irlba_2.3.3 Matrix_1.2-18
## [3] clusterExperiment_2.8.0 SingleCellExperiment_1.10.1
## [5] umap_0.2.5.0 Rtsne_0.15
## [7] EDASeq_2.22.0 ShortRead_1.46.0
## [9] GenomicAlignments_1.24.0 SummarizedExperiment_1.18.2
## [11] DelayedArray_0.14.1 matrixStats_0.56.0
## [13] Rsamtools_2.4.0 GenomicRanges_1.40.0
## [15] GenomeInfoDb_1.24.2 Biostrings_2.56.0
## [17] XVector_0.28.0 IRanges_2.22.2
## [19] BiocParallel_1.22.0 Biobase_2.48.0
## [21] awst_0.0.4 S4Vectors_0.26.1
## [23] BiocGenerics_0.34.0 clues_0.6.2.2
## [25] dendextend_1.13.4 knitr_1.28
## [27] BiocManager_1.30.10
##
## loaded via a namespace (and not attached):
## [1] uuid_0.1-4 aroma.light_3.18.0 BiocFileCache_1.12.1
## [4] NMF_0.22.0 plyr_1.8.6 lazyeval_0.2.2
## [7] splines_4.0.0 rncl_0.8.4 ggplot2_3.3.1
## [10] gridBase_0.4-7 digest_0.6.25 foreach_1.5.0
## [13] htmltools_0.4.0 viridis_0.5.1 magrittr_1.5
## [16] memoise_1.1.0 cluster_2.1.0 doParallel_1.0.15
## [19] limma_3.44.3 annotate_1.66.0 R.utils_2.9.2
## [22] askpass_1.1 prettyunits_1.1.1 jpeg_0.1-8.1
## [25] colorspace_1.4-1 blob_1.2.1 rappdirs_0.3.1
## [28] xfun_0.14 dplyr_1.0.0 crayon_1.3.4
## [31] RCurl_1.98-1.2 jsonlite_1.6.1 genefilter_1.70.0
## [34] phylobase_0.8.10 survival_3.1-12 iterators_1.0.12
## [37] ape_5.4 glue_1.4.1 registry_0.5-1
## [40] gtable_0.3.0 zlibbioc_1.34.0 kernlab_0.9-29
## [43] Rhdf5lib_1.10.1 HDF5Array_1.16.1 scales_1.1.1
## [46] DESeq_1.39.0 DBI_1.1.0 edgeR_3.30.3
## [49] rngtools_1.5 bibtex_0.4.2.2 Rcpp_1.0.4.6
## [52] viridisLite_0.3.0 xtable_1.8-4 progress_1.2.2
## [55] reticulate_1.16 bit_1.1-15.2 httr_1.4.1
## [58] RColorBrewer_1.1-2 ellipsis_0.3.1 pkgconfig_2.0.3
## [61] XML_3.99-0.3 R.methodsS3_1.8.0 dbplyr_1.4.4
## [64] locfit_1.5-9.4 softImpute_1.4 howmany_0.3-1
## [67] tidyselect_1.1.0 rlang_0.4.6 reshape2_1.4.4
## [70] AnnotationDbi_1.50.3 munsell_0.5.0 tools_4.0.0
## [73] generics_0.0.2 RSQLite_2.2.0 ade4_1.7-15
## [76] evaluate_0.14 stringr_1.4.0 yaml_2.2.1
## [79] bit64_0.9-7 purrr_0.3.4 nlme_3.1-148
## [82] R.oo_1.23.0 xml2_1.3.2 biomaRt_2.44.1
## [85] compiler_4.0.0 curl_4.3 png_0.1-7
## [88] tibble_3.0.1 geneplotter_1.66.0 RNeXML_2.4.4
## [91] stringi_1.4.6 highr_0.8 GenomicFeatures_1.40.1
## [94] RSpectra_0.16-0 lattice_0.20-41 vctrs_0.3.1
## [97] pillar_1.4.4 lifecycle_0.2.0 zinbwave_1.10.0
## [100] bitops_1.0-6 rtracklayer_1.48.0 R6_2.4.1
## [103] latticeExtra_0.6-29 hwriter_1.3.2 gridExtra_2.3
## [106] codetools_0.2-16 MASS_7.3-51.6 assertthat_0.2.1
## [109] rhdf5_2.32.2 openssl_1.4.1 pkgmaker_0.31.1
## [112] withr_2.2.0 GenomeInfoDbData_1.2.3 locfdr_1.1-8
## [115] hms_0.5.3 grid_4.0.0 tidyr_1.1.0
## [118] rmarkdown_2.2
knitr::knit_exit()